## Read in data
library(readxl)
demo_df <- read_excel("Humana Case 2020 Q1 -- Data.xlsx", 
     sheet = "Demographics", col_types = c("text", 
         "numeric", "numeric", "numeric", 
         "numeric", "text", "text", "text", 
         "numeric", "numeric", "text", "text"))

claims_df <- read_excel("Humana Case 2020 Q1 -- Data.xlsx", 
    sheet = "Claims")

er_df <- read_excel("Humana Case 2020 Q1 -- Data.xlsx", 
    sheet = "ER Utilization")

pcp_df <- read_excel("Humana Case 2020 Q1 -- Data.xlsx", 
    sheet = "PCP Visits")

hospital_df <- read_excel("Humana Case 2020 Q1 -- Data.xlsx", 
    sheet = "Hospital Admissions")

rx_df <- read_excel("Humana Case 2020 Q1 -- Data.xlsx", 
    sheet = "Rx Utilization")

Data Preparation

## Format data
claims = claims_df %>% 
        pivot_longer(-id, names_to = "time", values_to = "agg_claim") %>% 
        transmute(id = as.character(id),
               time = excel_numeric_to_date(as.numeric(time), 
                                            date_system = "modern" ),
               agg_claim)
        
er = er_df %>% 
        pivot_longer(-id, names_to = "time", values_to = "count") %>% 
        transmute(id = as.character(id),
               time = excel_numeric_to_date(as.numeric(time), 
                                            date_system = "modern" ),
               er_count = count)

pcp = pcp_df %>% 
        pivot_longer(-id, names_to = "time", values_to = "count") %>% 
        transmute(id = as.character(id),
               time = excel_numeric_to_date(as.numeric(time), 
                                            date_system = "modern" ),
               pcp_count = count)

hospital = hospital_df %>% 
        pivot_longer(-id, names_to = "time", values_to = "count") %>% 
        transmute(id = as.character(id),
               time = excel_numeric_to_date(as.numeric(time), 
                                            date_system = "modern" ),
               hospital_count = count)

rx = rx_df %>% 
        pivot_longer(-id, names_to = "time", values_to = "count") %>% 
        transmute(id = as.character(id),
               time = excel_numeric_to_date(as.numeric(time), 
                                            date_system = "modern" ),
               rx_count = count)
## Check counts
er %>% group_by(er_count) %>% 
    count() %>% arrange()
## # A tibble: 4 x 2
## # Groups:   er_count [4]
##   er_count      n
##      <dbl>  <int>
## 1        0 340056
## 2        1  12684
## 3        2    250
## 4        3      2
pcp %>% group_by(pcp_count) %>% 
    count() %>% arrange()
## # A tibble: 5 x 2
## # Groups:   pcp_count [5]
##   pcp_count      n
##       <dbl>  <int>
## 1         0 317878
## 2         1  33273
## 3         2   1767
## 4         3     72
## 5         4      2
hospital %>% group_by(hospital_count) %>% 
    count() %>% arrange()
## # A tibble: 3 x 2
## # Groups:   hospital_count [3]
##   hospital_count      n
##            <dbl>  <int>
## 1              0 345720
## 2              1   7210
## 3              2     62
rx %>% group_by(rx_count) %>% 
    count() %>% arrange()
## # A tibble: 6 x 2
## # Groups:   rx_count [6]
##   rx_count      n
##      <dbl>  <int>
## 1        0 286532
## 2        1  59431
## 3        2   6515
## 4        3    485
## 5        4     28
## 6        5      1
claims %>% group_by(agg_claim) %>% 
    count() %>% arrange() # assume mistyped a minus sign in front
## # A tibble: 148,856 x 2
## # Groups:   agg_claim [148,856]
##    agg_claim      n
##        <dbl>  <int>
##  1 -8.05e+ 2      1
##  2 -6.64e+ 2      1
##  3 -3.21e+ 2      1
##  4 -1.00e-13      1
##  5  0.       138787
##  6  3.13e+ 1      1
##  7  3.30e+ 1      1
##  8  3.48e+ 1      1
##  9  3.52e+ 1      1
## 10  3.57e+ 1      1
## # … with 148,846 more rows
## Check: each individual either get one or none CRD
demo_df %>% mutate(test_sum = transport+fin_assis+lonely+food_insec) %>% 
        filter(test_sum > 1)
## # A tibble: 0 x 13
## # … with 13 variables: id <chr>, transport <dbl>, fin_assis <dbl>,
## #   lonely <dbl>, food_insec <dbl>, region <chr>, rural <chr>,
## #   is_lowincome <chr>, age <dbl>, chronic_count <dbl>, gender <chr>,
## #   mth_out <chr>, test_sum <dbl>
## Check: region
demo_df %>% group_by(region) %>% 
    count() %>% arrange(n) # unknown small, ignore
## # A tibble: 6 x 2
## # Groups:   region [6]
##   region        n
##   <chr>     <int>
## 1 Unknown       7
## 2 Northwest   777
## 3 Southwest   857
## 4 Northeast  1480
## 5 Central    1925
## 6 Southeast  2308
## Check: rural
demo_df %>% group_by(rural) %>% 
    count() %>% arrange(n) # unknown small, ignore
## # A tibble: 5 x 2
## # Groups:   rural [5]
##   rural          n
##   <chr>      <int>
## 1 Unknown        7
## 2 Rural        605
## 3 Semi-Rural  1414
## 4 Suburban    2283
## 5 Urban       3045
## Check: is_lowincome
demo_df %>% group_by(is_lowincome) %>% 
    count() %>% arrange(n) # inconsistant label, need to fix
## # A tibble: 5 x 2
## # Groups:   is_lowincome [5]
##   is_lowincome     n
##   <chr>        <int>
## 1 YES              1
## 2 No               3
## 3 Yes              3
## 4 Y             3048
## 5 N             4299
## Check: age
demo_df %>% group_by(age) %>% count() %>% filter(age<5) %>% arrange(age) # need to correct negative values, assume mistyped minues sign
## # A tibble: 14 x 2
## # Groups:   age [14]
##      age     n
##    <dbl> <int>
##  1   -25     1
##  2   -21     1
##  3   -18     1
##  4   -17     1
##  5   -14     1
##  6   -12     1
##  7   -11     1
##  8    -8     1
##  9    -7     1
## 10    -5     1
## 11    -2     2
## 12    -1     1
## 13     2     1
## 14     4     1
demo_df %>% group_by(age) %>% count() %>% filter(age>95) %>% arrange(age) # need to correct >100 values, assume mistyped a 1 in the front
## # A tibble: 10 x 2
## # Groups:   age [10]
##      age     n
##    <dbl> <int>
##  1    98     2
##  2   100     1
##  3   101     1
##  4   104     1
##  5   151     1
##  6   160     1
##  7   165     1
##  8   167     1
##  9   171     2
## 10   175     1
## Check: chronic_count
demo_df %>% group_by(chronic_count) %>% count() %>% arrange(chronic_count) # assume mistyped a 1 in the front
## # A tibble: 10 x 2
## # Groups:   chronic_count [10]
##    chronic_count     n
##            <dbl> <int>
##  1             0  1299
##  2             1  1503
##  3             2  1812
##  4             3  1477
##  5             4   844
##  6             5   336
##  7             6    71
##  8             7     9
##  9            10     1
## 10            11     2
## Check: gender
demo_df %>% group_by(gender) %>% count() %>% arrange(n) # will just ignore other than M, F
## # A tibble: 5 x 2
## # Groups:   gender [5]
##   gender     n
##   <chr>  <int>
## 1 W          1
## 2 U          2
## 3 X          2
## 4 M       3553
## 5 F       3796
demo = demo_df %>% 
    mutate(is_lowincome = demo_df$is_lowincome %in% c("Y","Yes","YES"),
           age = ifelse(age>110, age-100, abs(age)),
           mth_out = decimal_date(mdy(paste(mth_out,"1","2017"))),
           chronic_count = ifelse(chronic_count>=10,
                                  chronic_count-10,
                                  chronic_count)) %>% 
    filter(region != "Unknown" & rural != "Unknown" & gender %in% c("M","F"))

## Add a column indicate which CRD an individual received
crd = NA
for(i in 1:length(demo$id)){
        crd[i]=0
        if(demo$transport[i] == 1){
                crd[i] = 1
        }
        if(demo$fin_assis[i] == 1){
                crd[i] = 2
        }
         if(demo$lonely[i] == 1){
                crd[i] = 3
         }
         if(demo$food_insec[i] == 1){
                crd[i] = 4
         }
}
demo = demo %>% cbind(crd)

## Check: all NAs represent people from the control group
demo %>% filter(is.na(mth_out),
                crd !=0)
##  [1] id            transport     fin_assis     lonely        food_insec   
##  [6] region        rural         is_lowincome  age           chronic_count
## [11] gender        mth_out       crd          
## <0 rows> (or 0-length row.names)
## Join utilization and claim tables
loss = er %>% 
    inner_join(hospital, by = c("id"="id","time"="time")) %>% 
    inner_join(pcp, by = c("id"="id","time"="time")) %>% 
    inner_join(rx, by = c("id"="id","time"="time")) %>% 
    mutate(total_count = er_count+hospital_count+pcp_count+rx_count) %>% 
    inner_join(claims, by = c("id"="id","time"="time")) %>% 
    mutate(agg_claim = abs(agg_claim),
           avg_claim = ifelse(agg_claim*total_count!=0,agg_claim/total_count,0))

## Convert date to decimal year
loss = loss %>% mutate(time = decimal_date(time))

## Reduction
demo = demo %>% 
    select(id,region,rural,is_lowincome,age,chronic_count,gender,mth_out,crd)
remove(claims,claims_df,demo_df,er,er_df,hospital,hospital_df,pcp,pcp_df,rx,rx_df,crd,i)

## Join demo(features) with loss(responses)
impact = demo %>% inner_join(loss, by = "id")

## Add a column to indicate if row is after_crd
impact = impact %>% 
    mutate(after_crd = ifelse(!is.na(mth_out),time >= mth_out,NA))

#impact = impact %>% select(-mth_out)

Exploratory Data Analysis

## Pair-wise relationship in demo
demo %>% select(-id) %>% ggpairs()

## Average utilization (aka # of claims) across the entire population over time
loss %>% group_by(time) %>% 
    summarize(a=mean(er_count),b=mean(hospital_count),
              c=mean(pcp_count),d=mean(rx_count),e=mean(total_count))%>% 
    ggplot() +
    geom_line(aes(x=time,y=a),color="red",size=1.5)+
    geom_line(aes(x=time,y=b),color="blue",size=1.5)+
    geom_line(aes(x=time,y=c),color="green",size=1.5)+
    geom_line(aes(x=time,y=d),color="black",size=1.5)+
    geom_line(aes(x=time,y=e),color="magenta",size=1.5)+
    geom_smooth(aes(x=time,y=a),color="black",method="lm",size=0.5)+
    geom_smooth(aes(x=time,y=b),color="black",method="lm",size=0.5)+
    geom_smooth(aes(x=time,y=c),color="black",method="lm",size=0.5)+
    geom_smooth(aes(x=time,y=d),color="black",method="lm",size=0.5)+
    geom_smooth(aes(x=time,y=e),color="black",method="lm",size=0.5)

## Average utilization (aka # of claims) across the entire population over time
loss %>% group_by(time) %>% 
    summarize(avg_agg_claim=mean(agg_claim),avg_avg_claim=mean(avg_claim))%>% 
    ggplot() +
    geom_line(aes(x=time,y=avg_agg_claim),color="red",size=1.5)+
    geom_line(aes(x=time,y=avg_avg_claim),color="blue",size=1.5)+
    geom_smooth(aes(x=time,y=avg_agg_claim),color="black",method="lm",size=0.5)+
    geom_smooth(aes(x=time,y=avg_avg_claim),color="black",method="lm",size=0.5)
out = sort(unique(impact$mth_out))
before=list()
after=list()
for(i in 1:length(out)){
        tmp = impact %>% filter(mth_out==out[i])
        before[[i]] = tmp %>% filter(after_crd==F)
        #write.csv(before[[i]],paste('before_received_crd_in_month_',i,'.csv'))
        after[[i]] = tmp %>% filter(after_crd==T)
        #write.csv(after[[i]],paste('after_received_crd_in_month_',i,'.csv'))
}
not_control = impact %>% filter(!is.na(mth_out))
control = impact %>% filter(is.na(mth_out))
#write.csv(control,paste('control.csv'))

Misc.

tmp = control

log_claim = log(tmp$agg_claim)

#tmp %>% ggplot() + geom_density(aes(x=log_claim))
#tmp %>% ggplot() + geom_bar(aes(x=total_count))
tmp %>% ggplot() + geom_point(aes(x=time,y=log_claim,color=factor(er_count)),alpha=0.1)

tmp %>% ggplot() + geom_point(aes(x=time,y=log_claim,color=factor(hospital_count)),alpha=0.1)

tmp %>% ggplot() + geom_point(aes(x=time,y=log_claim,color=factor(rx_count)),alpha=0.3)

tmp %>% ggplot() + geom_point(aes(x=time,y=log_claim,color=factor(pcp_count)),alpha=0.3)

ggplot(data = tmp) +
    geom_point(mapping = aes(x = time, y = log_claim),alpha=0.1) + 
    facet_grid(rows = vars(er_count), cols = vars(hospital_count))

tmp = control

log_claim = log(tmp$agg_claim)

#tmp %>% ggplot() + geom_density(aes(x=log_claim))
#tmp %>% ggplot() + geom_bar(aes(x=total_count))


tmp %>% filter(er_count>0) %>% 
    ggplot() + geom_point(aes(x=time,y=log(agg_claim),color=factor(er_count)),alpha=0.3) + ylim(4,10)
tmp %>% filter(hospital_count>0) %>% 
    ggplot() + geom_point(aes(x=time,y=log(agg_claim),color=factor(hospital_count)),alpha=0.3)+ ylim(4,10)
tmp %>% filter(rx_count>0) %>% 
    ggplot() + geom_point(aes(x=time,y=log(agg_claim),color=factor(rx_count)),alpha=0.3)+ ylim(4,10)
tmp %>% filter(pcp_count>0) %>% 
    ggplot() + geom_point(aes(x=time,y=log(agg_claim),color=factor(pcp_count)),alpha=0.3)+ ylim(4,10)

ggplot(data = tmp) +
    geom_point(mapping = aes(x = time, y = log_claim),alpha=0.1) + 
    facet_grid(rows = vars(er_count), cols = vars(rx_count))
ggplot(data = tmp) +
    geom_point(mapping = aes(x = time, y = log_claim),alpha=0.1) + 
    facet_grid(rows = vars(rx_count), cols = vars(er_count))
tmp = control
er_claim=0
hospital_claim=0
pcp_claim=0
rx_claim=0
er_claim = ifelse((er_count>0)&(hospital_count*pcp_count*rx_count==0),
                                       agg_claim/er_count,
                                       avg_claim)
hospital_claim = ifelse((hospital_count>0)&(er_count*pcp_count*rx_count==0),
                                       agg_claim/hospital_count,
                                       avg_claim)
pcp_claim = ifelse((pcp_count>0)&(er_count*hospital_count*rx_count==0),
                                       agg_claim/pcp_count,
                                       avg_claim)
rx_claim = ifelse((rx_count>0)&(hospital_count*pcp_count*pcp_count==0),
                                       agg_claim/rx_count,
                                       avg_claim)
tmp = tmp %>% cbind(er_claim,hospital_claim,pcp_claim,rx_claim)
log_claim = log(tmp$agg_claim)
#sort(((tmp$er_claim)),decreasing =T)
tmp %>% ggplot() + geom_point(aes(x=time,y=log(er_claim),color=factor(er_count)),alpha=0.1)+ ylim(4,10)
tmp %>% ggplot() + geom_point(aes(x=time,y=log(hospital_claim),color=factor(hospital_count)),alpha=0.1)+ ylim(4,10)
tmp %>% ggplot() + geom_point(aes(x=time,y=log(rx_claim),color=factor(rx_count)),alpha=0.3)+ ylim(4,10)
tmp %>% ggplot() + geom_point(aes(x=time,y=log(pcp_claim),color=factor(pcp_count)),alpha=0.3)+ ylim(4,10)
attach(control)
tmp = control
model1 = lm(log(agg_claim+1)~er_count*hospital_count*rx_count*pcp_count+
                time+gender+age*factor(chronic_count))
summary(model1)
tmp = tmp %>% cbind(fit = fitted(model1),res = residuals(model1))

ggplot(data=tmp) + geom_point(mapping=aes(x=fit,y=res,color=region),alpha=0.5)

ggplot(data=tmp) + geom_boxplot(mapping=aes(x=factor(chronic_count),y=res))
ggplot(data=tmp) + geom_boxplot(mapping=aes(x=factor(age),y=res))

ggplot(data=tmp,mapping=aes(x=age,y=res)) + geom_point(alpha=0.1)
ggplot(data=tmp,mapping=aes(x=factor(chronic_count),y=res)) + geom_jitter(alpha=0.1)


ggplot(data = tmp) +
    geom_point(mapping = aes(x = fit, y = res),alpha=0.1) + 
    facet_grid(rows = vars(cut(age,c(0,20,40,60,80,100,120))), cols = vars(factor(chronic_count)))


ggplot(data=tmp) + geom_jitter(mapping=aes(x=age,y=factor(chronic_count)),alpha=0.1)

ggplot(data=tmp) + geom_density(aes(x=age))
ggplot(data=tmp) + geom_bar(aes(x=factor(chronic_count)))
attach(impact)
tmp = impact
sin_term = sin(2*pi*impact$time)
cos_term = cos(2*pi*impact$time)
time_sq <- impact$time^2 # will use later
time_cu <- impact$time^3 # will use later

## Frequency model
model_total_count = glm(total_count ~ sin_term + cos_term + time + 
                           region + rural + is_lowincome + age + 
                           chronic_count + factor(crd),
                        family = poisson)
model1 = model_total_count
summary(model1)
tmp = tmp %>% cbind(fit = fitted(model1),res = residuals(model1))
#ggplot(data=tmp) + geom_point(mapping=aes(x=fit,y=res,color=factor(crd)))
#ggplot(data=tmp) + geom_point(mapping=aes(x=fit,y=res,color=factor(region)))
impact %>% 
    group_by(after_crd,time) %>% 
    summarize(monthly_total_count = mean(total_count),
              monthly_agg_claim = mean(agg_claim),
              monthly_avg_claim = mean(avg_claim)) %>%
    ggplot(aes(x=time,y=monthly_total_count,color=after_crd))+
    geom_point()+
    geom_smooth(method="lm")

impact %>% 
    group_by(after_crd,time) %>% 
    summarize(monthly_total_count = mean(total_count),
              monthly_agg_claim = mean(agg_claim),
              monthly_avg_claim = mean(avg_claim)) %>%
    ggplot(aes(x=time,y=monthly_agg_claim,color=after_crd))+
    geom_point()+
    geom_smooth(method="lm")

impact %>% 
    group_by(after_crd,time) %>% 
    summarize(monthly_total_count = mean(total_count),
              monthly_agg_claim = mean(agg_claim),
              monthly_avg_claim = mean(avg_claim)) %>%
    ggplot(aes(x=time,y=monthly_avg_claim,color=after_crd))+
    geom_point()+
    geom_smooth(method="lm")
a = impact %>% 
        filter(after_crd==F) %>% 
        group_by(mth_out,time) %>% 
        summarize(monthly_total_count = mean(total_count),
                  monthly_agg_claim = mean(agg_claim),
                  monthly_avg_claim = mean(avg_claim)) 
b = impact %>% 
        filter(after_crd==T) %>% 
        group_by(mth_out,time) %>% 
        summarize(monthly_total_count = mean(total_count),
                  monthly_agg_claim = mean(agg_claim),
                  monthly_avg_claim = mean(avg_claim)) 

c = impact %>% 
        filter(is.na(after_crd)) %>% 
        group_by(mth_out,time) %>% 
        summarize(monthly_total_count = mean(total_count),
                  monthly_agg_claim = mean(agg_claim),
                  monthly_avg_claim = mean(avg_claim)) 

ggplot(mapping = aes(x=time,y=monthly_total_count,color=factor(mth_out)))+
        #geom_point(data=a)+
        geom_smooth(data=a,method="lm",se=F)+
        #geom_point(data=b)+
        geom_smooth(data=b,method="lm",se=F)+
        geom_smooth(data=c,se=T)
not_control = impact %>% filter(!is.na(mth_out))
control = impact %>% filter(is.na(mth_out))
signif_level = 0.05
test_type = "greater"

### Raw comparison
tmp = not_control %>% group_by(crd,after_crd,id) %>% 
        summarize(avg_mthly_claim = mean(agg_claim)) %>% 
        spread(key=after_crd,value=avg_mthly_claim) %>% ungroup()

is_effective=tmp$`TRUE`<tmp$`FALSE`

tmp = tmp %>% cbind(eff=is_effective)
x = tmp %>% group_by(crd) %>%
        summarize(n_eff=sum(eff),
                  n_not_eff=sum(!eff),
                  n=n(),
                  eff_prop=mean(eff))

## Test for Normality
tmp = not_control %>% group_by(crd, after_crd, id) %>% 
        summarize(avg_mthly_claim = mean(agg_claim)) %>% 
        spread(key = after_crd, value = avg_mthly_claim) %>% ungroup()

diff = tmp$`FALSE` - tmp$`TRUE`
shapiro.test(diff) #small p-value, not Normal, note sample size big
ks.test(diff,"pnorm") #small p-value, not Normal, note sample size big
ggqqplot(diff) #looking at qqplot and density, pretty Normal
ggplot()+geom_density(aes(x=diff))

## Test for significance in difference

#paired t test - assume Normal
not_control %>% group_by(crd, 
                         after_crd, id) %>% 
        summarize(avg_mthly_claim = mean(agg_claim)) %>% 
        spread(key = after_crd, value = avg_mthly_claim) %>% 
        ungroup() %>% group_by(crd) %>% 
        do(tidy(t.test(.$`FALSE`, .$`TRUE`, data=., 
               alternative = test_type, paired = TRUE))) %>% 
        mutate(is_eff = p.value<signif_level) %>% 
        select(crd, 
               is_eff) %>% data.frame(x) 
        
#wilcoxon signed-rank test - not Normal
not_control %>% group_by(crd, 
                         after_crd, id) %>% 
        summarize(avg_mthly_claim = mean(agg_claim)) %>% 
        spread(key = after_crd, value = avg_mthly_claim) %>% 
        ungroup() %>% group_by(crd) %>% 
        do(tidy(wilcox.test(.$`FALSE`, .$`TRUE`, data=., 
                       alternative = test_type, paired = TRUE))) %>% 
        mutate(is_eff = p.value<signif_level) %>% 
        select(crd, 
               is_eff) %>% data.frame(x) 
tmp = not_control %>% group_by(age,gender,chronic_count,
                               is_lowincome,rural,region,crd,
                               after_crd,id) %>% 
    summarize(avg_mthly_claim = mean(agg_claim)) %>% 
    spread(key=after_crd,value=avg_mthly_claim) %>% ungroup()

is_effective=tmp$`TRUE`<tmp$`FALSE`
tmp = tmp %>% cbind(eff=is_effective)

x = tmp %>% group_by(age,gender,chronic_count,
                 is_lowincome,rural,region,crd) %>%
    summarize(n_eff=sum(eff),
              n=sum(eff+!eff),
              eff_prop=mean(eff))
signif_level = 0.05
test_type = "greater"

### Raw comparison
tmp = not_control %>% group_by(age,gender,chronic_count,
                 is_lowincome,rural,region,crd,
                 after_crd,id) %>% 
        summarize(avg_mthly_claim = mean(agg_claim)) %>% 
        spread(key=after_crd,value=avg_mthly_claim) %>% ungroup()

is_effective=tmp$`TRUE`<tmp$`FALSE`

tmp = tmp %>% cbind(eff=is_effective)
x = tmp %>% group_by(age,gender,chronic_count,
                 is_lowincome,rural,region,crd) %>%
        summarize(n_eff=sum(eff),
                  n_not_eff=sum(!eff),
                  n=n(),
                  eff_prop=mean(eff))

## Test for Normality
tmp = not_control %>% group_by(age,gender,chronic_count,
                 is_lowincome,rural,region,crd,
                 after_crd, id) %>% 
        summarize(avg_mthly_claim = mean(agg_claim)) %>% 
        spread(key = after_crd, value = avg_mthly_claim) %>% ungroup()

diff = tmp$`FALSE` - tmp$`TRUE`
shapiro.test(diff) #small p-value, not Normal, note sample size big
ks.test(diff,"pnorm") #small p-value, not Normal, note sample size big
ggqqplot(diff) #looking at qqplot and density, pretty Normal
ggplot()+geom_density(aes(x=diff))

## Test for significance in difference

#paired t test - assume Normal
not_control %>% group_by(age,gender,chronic_count,
                 is_lowincome,rural,region,crd,
                 after_crd, id) %>% 
        summarize(avg_mthly_claim = mean(agg_claim)) %>% 
        spread(key = after_crd, value = avg_mthly_claim) %>% 
        ungroup() %>% group_by(age,gender,chronic_count,
                 is_lowincome,rural,region,crd) %>% 
        do(tidy(t.test(.$`FALSE`, .$`TRUE`, data=., 
               alternative = test_type, paired = TRUE))) %>% 
        mutate(is_eff = p.value<signif_level) %>% 
        select(age,gender,chronic_count,
               is_lowincome,rural,region,crd,
               is_eff) %>% data.frame(x) 
        
#wilcoxon signed-rank test - not Normal
x = not_control %>% group_by(age,gender,chronic_count,
                         is_lowincome,rural,region,crd, 
                         after_crd, id) %>% 
        summarize(avg_mthly_claim = mean(agg_claim)) %>% 
        spread(key = after_crd, value = avg_mthly_claim) %>% 
        ungroup() %>% group_by(age,gender,chronic_count,
                 is_lowincome,rural,region,crd) %>% 
        do(tidy(wilcox.test(.$`FALSE`, .$`TRUE`, data=., 
                       alternative = test_type, paired = TRUE))) %>% 
        mutate(is_eff = p.value<signif_level) %>% 
        select(age,gender,chronic_count,
               is_lowincome,rural,region,crd,
               is_eff) %>% data.frame(x) %>% 
    select(age,gender,chronic_count,
           is_lowincome,rural,region,crd,
           n_eff,n_not_eff,n,eff_prop,
           is_eff)
t=list()
tmp=list()
for(i in 1:4){
        tmp[[i]] = not_control %>% filter(crd == i)
        
        df = tmp[[i]] %>% group_by(after_crd,id) %>% 
                summarize(avg_mthly_claim = mean(agg_claim)) %>% 
                spread(key=after_crd,value=avg_mthly_claim) %>% ungroup()
        
        is_effective=ifelse(df$`TRUE`<df$`FALSE`,"Y","N")
        
        df = df %>% cbind(is_eff=is_effective)
        df = df %>% inner_join(demo,by="id") %>% 
                select(-c("FALSE","TRUE","id","mth_out","crd")) %>% 
                mutate(is_lowincome=as.factor(is_lowincome),
                       region=as.factor(region),
                       rural=as.factor(rural),
                       gender=as.factor(gender)
                )
        
        t[[i]] = C5.0(is_eff~.-is_eff, data=df)
        plot(t[[i]])
        print(summary(t[[i]]))
}

## 
## Call:
## C5.0.formula(formula = is_eff ~ . - is_eff, data = df)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Mon Mar  2 03:24:25 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 1430 cases (7 attributes) from undefined.data
## 
## Decision tree:
## 
## is_lowincome = FALSE:
## :...region in {Northeast,Southwest}: N (324/130)
## :   region = Northwest:
## :   :...chronic_count <= 1: N (39/6)
## :   :   chronic_count > 1:
## :   :   :...rural in {Rural,Semi-Rural,Urban}: N (46/14)
## :   :       rural = Suburban: Y (22/8)
## :   region = Central:
## :   :...rural in {Rural,Semi-Rural}: N (68/32)
## :   :   rural = Suburban: Y (77/30)
## :   :   rural = Urban:
## :   :   :...chronic_count > 4: N (6)
## :   :       chronic_count <= 4:
## :   :       :...gender = F: Y (64/28)
## :   :           gender = M: N (25/7)
## :   region = Southeast:
## :   :...gender = M: Y (83/39)
## :       gender = F:
## :       :...rural in {Suburban,Urban}: N (181/72)
## :           rural = Rural:
## :           :...age <= 73: N (11/3)
## :           :   age > 73: Y (2)
## :           rural = Semi-Rural:
## :           :...age <= 73: N (37/12)
## :               age > 73: Y (11/2)
## is_lowincome = TRUE:
## :...chronic_count > 4: N (31/9)
##     chronic_count <= 4:
##     :...rural = Semi-Rural: Y (62/24)
##         rural = Rural:
##         :...region in {Northeast,Southwest}: N (7/2)
##         :   region = Northwest: Y (5/1)
##         :   region = Central:
##         :   :...age <= 64: N (7/1)
##         :   :   age > 64: Y (5/1)
##         :   region = Southeast:
##         :   :...chronic_count <= 2: N (7/2)
##         :       chronic_count > 2: Y (6/1)
##         rural = Urban:
##         :...chronic_count <= 0:
##         :   :...age <= 77: Y (28/11)
##         :   :   age > 77: N (4/1)
##         :   chronic_count > 0:
##         :   :...age <= 66: N (71/25)
##         :       age > 66: Y (65/29)
##         rural = Suburban:
##         :...region in {Central,Southwest}: Y (39/16)
##             region = Northwest:
##             :...chronic_count <= 1: N (3)
##             :   chronic_count > 1: Y (10/2)
##             region = Northeast:
##             :...age <= 50: N (3)
##             :   age > 50:
##             :   :...chronic_count <= 3: Y (17/4)
##             :       chronic_count > 3: N (4/1)
##             region = Southeast:
##             :...chronic_count <= 1: N (26/10)
##                 chronic_count > 1:
##                 :...chronic_count <= 3: Y (26/7)
##                     chronic_count > 3: N (8/2)
## 
## 
## Evaluation on training data (1430 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      36  532(37.2%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     579   203    (a): class N
##     329   319    (b): class Y
## 
## 
##  Attribute usage:
## 
##  100.00% is_lowincome
##   81.75% region
##   66.64% rural
##   44.48% chronic_count
##   28.95% gender
##   18.53% age
## 
## 
## Time: 0.0 secs

## 
## Call:
## C5.0.formula(formula = is_eff ~ . - is_eff, data = df)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Mon Mar  2 03:24:26 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 1509 cases (7 attributes) from undefined.data
## 
## Decision tree:
## 
## is_lowincome = TRUE:
## :...rural = Suburban: N (283/120)
## :   rural = Rural:
## :   :...gender = F: Y (16/5)
## :   :   gender = M:
## :   :   :...chronic_count <= 0: Y (4)
## :   :       chronic_count > 0: N (51/19)
## :   rural = Semi-Rural:
## :   :...region in {Northeast,Southeast}: Y (101/46)
## :   :   region in {Northwest,Southwest}: N (26/9)
## :   :   region = Central:
## :   :   :...chronic_count <= 0: Y (10/2)
## :   :       chronic_count > 0: N (41/18)
## :   rural = Urban:
## :   :...region in {Central,Northwest}: Y (143/62)
## :       region = Northeast: N (58/25)
## :       region = Southeast:
## :       :...chronic_count <= 4: Y (82/40)
## :       :   chronic_count > 4: N (9)
## :       region = Southwest:
## :       :...gender = F: N (6/2)
## :           gender = M: Y (33/14)
## is_lowincome = FALSE:
## :...chronic_count <= 1: N (256/96)
##     chronic_count > 1:
##     :...rural = Rural: N (38/14)
##         rural = Suburban:
##         :...chronic_count <= 2: Y (52/16)
##         :   chronic_count > 2:
##         :   :...age <= 64: Y (31/14)
##         :       age > 64: N (33/11)
##         rural = Semi-Rural:
##         :...chronic_count <= 2: N (31/9)
##         :   chronic_count > 2:
##         :   :...chronic_count <= 3: Y (21/10)
##         :       chronic_count > 3:
##         :       :...age <= 66: Y (9/1)
##         :           age > 66: N (14/5)
##         rural = Urban:
##         :...region in {Northeast,Southwest}: N (55/20)
##             region = Northwest: Y (11/5)
##             region = Southeast:
##             :...age <= 78: Y (47/22)
##             :   age > 78: N (5)
##             region = Central:
##             :...age > 71: Y (11/1)
##                 age <= 71:
##                 :...chronic_count <= 4: N (27/11)
##                     chronic_count > 4: Y (5/1)
## 
## 
## Evaluation on training data (1509 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      30  598(39.6%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     574   239    (a): class N
##     359   337    (b): class Y
## 
## 
##  Attribute usage:
## 
##  100.00% is_lowincome
##   83.04% rural
##   55.86% chronic_count
##   44.40% region
##   12.06% age
##    7.29% gender
## 
## 
## Time: 0.0 secs

## 
## Call:
## C5.0.formula(formula = is_eff ~ . - is_eff, data = df)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Mon Mar  2 03:24:27 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 777 cases (7 attributes) from undefined.data
## 
## Decision tree:
## 
## chronic_count <= 2:
## :...rural = Rural:
## :   :...age <= 63: N (13/1)
## :   :   age > 63: Y (28/11)
## :   rural = Suburban:
## :   :...is_lowincome = FALSE:
## :   :   :...region in {Northwest,Southeast}: N (47/18)
## :   :   :   region = Central:
## :   :   :   :...age <= 65: N (15/5)
## :   :   :   :   age > 65: Y (11/1)
## :   :   :   region = Northeast:
## :   :   :   :...age <= 56: Y (5/1)
## :   :   :   :   age > 56: N (13/5)
## :   :   :   region = Southwest:
## :   :   :   :...chronic_count <= 0: N (2)
## :   :   :       chronic_count > 0:
## :   :   :       :...chronic_count <= 1: Y (4/1)
## :   :   :           chronic_count > 1: N (4/1)
## :   :   is_lowincome = TRUE:
## :   :   :...region in {Central,Northwest}: N (14/5)
## :   :       region = Northeast:
## :   :       :...chronic_count <= 0: Y (4)
## :   :       :   chronic_count > 0: N (10/4)
## :   :       region = Southwest:
## :   :       :...chronic_count <= 0: Y (2)
## :   :       :   chronic_count > 0: N (4/1)
## :   :       region = Southeast:
## :   :       :...chronic_count <= 1: Y (10/1)
## :   :           chronic_count > 1:
## :   :           :...age <= 66: Y (4)
## :   :               age > 66: N (3)
## :   rural = Semi-Rural:
## :   :...region = Central: N (18/5)
## :   :   region = Southwest: Y (15/3)
## :   :   region = Northeast:
## :   :   :...chronic_count <= 0: Y (3)
## :   :   :   chronic_count > 0: N (16/5)
## :   :   region = Northwest:
## :   :   :...is_lowincome = TRUE: N (4/1)
## :   :   :   is_lowincome = FALSE:
## :   :   :   :...chronic_count <= 0: N (3/1)
## :   :   :       chronic_count > 0: Y (6/1)
## :   :   region = Southeast:
## :   :   :...chronic_count <= 0: Y (5)
## :   :       chronic_count > 0:
## :   :       :...chronic_count <= 1: Y (8/2)
## :   :           chronic_count > 1:
## :   :           :...age > 77: Y (3)
## :   :               age <= 77:
## :   :               :...age <= 60: Y (7/2)
## :   :                   age > 60: N (7)
## :   rural = Urban:
## :   :...region = Central:
## :       :...gender = F: Y (28/10)
## :       :   gender = M: N (25/9)
## :       region = Northwest:
## :       :...gender = F: N (10/3)
## :       :   gender = M:
## :       :   :...is_lowincome = FALSE: Y (2)
## :       :       is_lowincome = TRUE: N (3/1)
## :       region = Southwest:
## :       :...gender = M: Y (11/3)
## :       :   gender = F:
## :       :   :...chronic_count <= 0: Y (4/1)
## :       :       chronic_count > 0: N (12/3)
## :       region = Northeast:
## :       :...is_lowincome = FALSE:
## :       :   :...gender = F:
## :       :   :   :...chronic_count <= 1: N (12/4)
## :       :   :   :   chronic_count > 1: Y (8/3)
## :       :   :   gender = M:
## :       :   :   :...age <= 60: N (2)
## :       :   :       age > 60: Y (2)
## :       :   is_lowincome = TRUE:
## :       :   :...chronic_count <= 0: N (3)
## :       :       chronic_count > 0:
## :       :       :...age <= 65: Y (5/1)
## :       :           age > 65: N (5/1)
## :       region = Southeast:
## :       :...is_lowincome = TRUE:
## :           :...age <= 66: Y (8)
## :           :   age > 66:
## :           :   :...age <= 82: N (13/3)
## :           :       age > 82: Y (2)
## :           is_lowincome = FALSE:
## :           :...chronic_count <= 0:
## :               :...gender = M: N (8/3)
## :               :   gender = F:
## :               :   :...age <= 61: Y (4)
## :               :       age > 61: N (4/1)
## :               chronic_count > 0:
## :               :...gender = F: N (23/7)
## :                   gender = M:
## :                   :...age <= 71: Y (5/1)
## :                       age > 71: N (3)
## chronic_count > 2:
## :...region = Northwest: N (23/6)
##     region = Northeast:
##     :...rural = Semi-Rural: Y (17/6)
##     :   rural = Urban: N (26/5)
##     :   rural = Rural:
##     :   :...chronic_count > 4: N (2)
##     :   :   chronic_count <= 4:
##     :   :   :...age <= 72: N (2)
##     :   :       age > 72: Y (2)
##     :   rural = Suburban:
##     :   :...age > 73: N (4)
##     :       age <= 73:
##     :       :...gender = F: N (3/1)
##     :           gender = M: Y (4/1)
##     region = Southwest:
##     :...chronic_count > 4: N (8/1)
##     :   chronic_count <= 4:
##     :   :...rural = Rural: N (3/1)
##     :       rural = Semi-Rural:
##     :       :...age <= 64: Y (2)
##     :       :   age > 64: N (3/1)
##     :       rural = Suburban:
##     :       :...is_lowincome = FALSE: N (3/1)
##     :       :   is_lowincome = TRUE: Y (6/1)
##     :       rural = Urban:
##     :       :...is_lowincome = FALSE: Y (8/2)
##     :           is_lowincome = TRUE: N (6)
##     region = Central:
##     :...rural = Rural:
##     :   :...chronic_count <= 3: Y (4/1)
##     :   :   chronic_count > 3: N (8/2)
##     :   rural = Suburban:
##     :   :...age <= 55: N (4)
##     :   :   age > 55: Y (15/6)
##     :   rural = Semi-Rural:
##     :   :...age <= 66: N (9)
##     :   :   age > 66:
##     :   :   :...chronic_count > 3: Y (2)
##     :   :       chronic_count <= 3:
##     :   :       :...age <= 75: N (2)
##     :   :           age > 75: Y (2)
##     :   rural = Urban:
##     :   :...chronic_count <= 3: N (14/6)
##     :       chronic_count > 3:
##     :       :...chronic_count > 4: Y (9/4)
##     :           chronic_count <= 4:
##     :           :...age <= 60: N (5)
##     :               age > 60: Y (6/1)
##     region = Southeast:
##     :...is_lowincome = FALSE: N (57/21)
##         is_lowincome = TRUE:
##         :...age > 78: N (3)
##             age <= 78:
##             :...rural = Rural: Y (0)
##                 rural = Semi-Rural:
##                 :...chronic_count <= 3: N (2)
##                 :   chronic_count > 3: Y (4/1)
##                 rural = Urban:
##                 :...age <= 51: N (2)
##                 :   age > 51: Y (8/1)
##                 rural = Suburban:
##                 :...chronic_count <= 3: Y (4/1)
##                     chronic_count > 3:
##                     :...age <= 69: N (2)
##                         age > 69: Y (3/1)
## 
## 
## Evaluation on training data (777 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      92  200(25.7%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     355    68    (a): class N
##     132   222    (b): class Y
## 
## 
##  Attribute usage:
## 
##  100.00% chronic_count
##   94.72% region
##   88.29% rural
##   49.55% is_lowincome
##   32.82% age
##   22.27% gender
## 
## 
## Time: 0.0 secs

## 
## Call:
## C5.0.formula(formula = is_eff ~ . - is_eff, data = df)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Mon Mar  2 03:24:34 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 1207 cases (7 attributes) from undefined.data
## 
## Decision tree:
## 
## is_lowincome = FALSE:
## :...chronic_count <= 1: N (191/71)
## :   chronic_count > 1:
## :   :...rural in {Semi-Rural,Urban}: Y (205/92)
## :       rural = Rural:
## :       :...chronic_count <= 4: N (18/6)
## :       :   chronic_count > 4: Y (4)
## :       rural = Suburban:
## :       :...region in {Northwest,Southeast,Southwest}: N (52/19)
## :           region = Central:
## :           :...age <= 73: N (18/2)
## :           :   age > 73: Y (5/1)
## :           region = Northeast:
## :           :...age <= 71: Y (22/5)
## :               age > 71: N (3)
## is_lowincome = TRUE:
## :...rural = Semi-Rural: Y (126/51)
##     rural = Rural:
##     :...gender = F: Y (10/4)
##     :   gender = M: N (45/17)
##     rural = Suburban:
##     :...region in {Central,Southwest}: N (83/38)
##     :   region in {Northeast,Northwest}: Y (75/29)
##     :   region = Southeast:
##     :   :...age <= 71: Y (42/17)
##     :       age > 71:
##     :       :...chronic_count <= 1: Y (6/2)
##     :           chronic_count > 1: N (12/1)
##     rural = Urban:
##     :...region = Central: Y (80/31)
##         region = Northwest:
##         :...chronic_count <= 3: Y (23/10)
##         :   chronic_count > 3: N (6/2)
##         region = Southeast:
##         :...age <= 50: Y (11/1)
##         :   age > 50: N (79/33)
##         region = Southwest:
##         :...gender = F: N (4/1)
##         :   gender = M: Y (25/8)
##         region = Northeast:
##         :...gender = F: Y (7/1)
##             gender = M:
##             :...age <= 67: Y (32/10)
##                 age > 67: N (23/5)
## 
## 
## Evaluation on training data (1207 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      27  457(37.9%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     339   262    (a): class N
##     195   411    (b): class Y
## 
## 
##  Attribute usage:
## 
##  100.00% is_lowincome
##   84.18% rural
##   50.37% region
##   46.81% chronic_count
##   20.96% age
##   12.10% gender
## 
## 
## Time: 0.0 secs